import pandas as pd
import matplotlib.pyplot as plt
import eda
%matplotlib inline
import seaborn as sns
data = pd.read_csv("data/cust_seg.csv", low_memory=False)
data.head()
| Unnamed: 0 | fecha_dato | ncodpers | ind_empleado | pais_residencia | sexo | age | fecha_alta | ind_nuevo | antiguedad | ... | ind_hip_fin_ult1 | ind_plan_fin_ult1 | ind_pres_fin_ult1 | ind_reca_fin_ult1 | ind_tjcr_fin_ult1 | ind_valo_fin_ult1 | ind_viv_fin_ult1 | ind_nomina_ult1 | ind_nom_pens_ult1 | ind_recibo_ult1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2015-01-28 | 1375586 | N | ES | H | 35 | 2015-01-12 | 0.0 | 6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 1 | 1 | 2015-01-28 | 1050611 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 2 | 2 | 2015-01-28 | 1050612 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 3 | 3 | 2015-01-28 | 1050613 | N | ES | H | 22 | 2012-08-10 | 0.0 | 35 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 4 | 4 | 2015-01-28 | 1050614 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
5 rows × 48 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 48 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 1000000 non-null int64 1 fecha_dato 1000000 non-null object 2 ncodpers 1000000 non-null int64 3 ind_empleado 989218 non-null object 4 pais_residencia 989218 non-null object 5 sexo 989214 non-null object 6 age 1000000 non-null object 7 fecha_alta 989218 non-null object 8 ind_nuevo 989218 non-null float64 9 antiguedad 1000000 non-null object 10 indrel 989218 non-null float64 11 ult_fec_cli_1t 1101 non-null object 12 indrel_1mes 989218 non-null float64 13 tiprel_1mes 989218 non-null object 14 indresi 989218 non-null object 15 indext 989218 non-null object 16 conyuemp 178 non-null object 17 canal_entrada 989139 non-null object 18 indfall 989218 non-null object 19 tipodom 989218 non-null float64 20 cod_prov 982266 non-null float64 21 nomprov 982266 non-null object 22 ind_actividad_cliente 989218 non-null float64 23 renta 824817 non-null float64 24 ind_ahor_fin_ult1 1000000 non-null int64 25 ind_aval_fin_ult1 1000000 non-null int64 26 ind_cco_fin_ult1 1000000 non-null int64 27 ind_cder_fin_ult1 1000000 non-null int64 28 ind_cno_fin_ult1 1000000 non-null int64 29 ind_ctju_fin_ult1 1000000 non-null int64 30 ind_ctma_fin_ult1 1000000 non-null int64 31 ind_ctop_fin_ult1 1000000 non-null int64 32 ind_ctpp_fin_ult1 1000000 non-null int64 33 ind_deco_fin_ult1 1000000 non-null int64 34 ind_deme_fin_ult1 1000000 non-null int64 35 ind_dela_fin_ult1 1000000 non-null int64 36 ind_ecue_fin_ult1 1000000 non-null int64 37 ind_fond_fin_ult1 1000000 non-null int64 38 ind_hip_fin_ult1 1000000 non-null int64 39 ind_plan_fin_ult1 1000000 non-null int64 40 ind_pres_fin_ult1 1000000 non-null int64 41 ind_reca_fin_ult1 1000000 non-null int64 42 ind_tjcr_fin_ult1 1000000 non-null int64 43 ind_valo_fin_ult1 1000000 non-null int64 44 ind_viv_fin_ult1 1000000 non-null int64 45 ind_nomina_ult1 994598 non-null float64 46 ind_nom_pens_ult1 994598 non-null float64 47 ind_recibo_ult1 1000000 non-null int64 dtypes: float64(9), int64(24), object(15) memory usage: 366.2+ MB
data.corr
<bound method DataFrame.corr of Unnamed: 0 fecha_dato ncodpers ind_empleado pais_residencia sexo \
0 0 2015-01-28 1375586 N ES H
1 1 2015-01-28 1050611 N ES V
2 2 2015-01-28 1050612 N ES V
3 3 2015-01-28 1050613 N ES H
4 4 2015-01-28 1050614 N ES V
... ... ... ... ... ... ...
999995 999995 2015-02-28 1183296 N ES H
999996 999996 2015-02-28 1183295 N ES H
999997 999997 2015-02-28 1183294 N ES V
999998 999998 2015-02-28 1183293 N ES V
999999 999999 2015-02-28 1183289 N ES H
age fecha_alta ind_nuevo antiguedad ... ind_hip_fin_ult1 \
0 35 2015-01-12 0.0 6 ... 0
1 23 2012-08-10 0.0 35 ... 0
2 23 2012-08-10 0.0 35 ... 0
3 22 2012-08-10 0.0 35 ... 0
4 23 2012-08-10 0.0 35 ... 0
... ... ... ... ... ... ...
999995 27 2013-09-25 0.0 22 ... 0
999996 56 2013-09-25 0.0 22 ... 0
999997 39 2013-09-25 0.0 22 ... 0
999998 36 2013-09-25 0.0 22 ... 0
999999 38 2013-09-25 0.0 22 ... 0
ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 \
0 0 0 0
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
... ... ... ...
999995 0 0 0
999996 0 0 0
999997 0 0 0
999998 0 0 0
999999 0 0 0
ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 \
0 0 0 0 0.0
1 0 0 0 0.0
2 0 0 0 0.0
3 0 0 0 0.0
4 0 0 0 0.0
... ... ... ... ...
999995 0 0 0 0.0
999996 0 0 0 0.0
999997 0 0 0 0.0
999998 0 0 0 0.0
999999 0 0 0 0.0
ind_nom_pens_ult1 ind_recibo_ult1
0 0.0 0
1 0.0 0
2 0.0 0
3 0.0 0
4 0.0 0
... ... ...
999995 0.0 1
999996 0.0 0
999997 0.0 1
999998 0.0 1
999999 0.0 1
[1000000 rows x 48 columns]>
data = data.drop(columns="Unnamed: 0")
data.corr()
| ncodpers | ind_nuevo | indrel | indrel_1mes | tipodom | cod_prov | ind_actividad_cliente | renta | ind_ahor_fin_ult1 | ind_aval_fin_ult1 | ... | ind_hip_fin_ult1 | ind_plan_fin_ult1 | ind_pres_fin_ult1 | ind_reca_fin_ult1 | ind_tjcr_fin_ult1 | ind_valo_fin_ult1 | ind_viv_fin_ult1 | ind_nomina_ult1 | ind_nom_pens_ult1 | ind_recibo_ult1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ncodpers | 1.000000 | 0.002898 | 0.011554 | 0.001253 | NaN | -0.040761 | -0.187022 | -0.088417 | -0.013469 | -0.005766 | ... | -0.100045 | -0.125874 | -0.027066 | -0.189129 | -0.190208 | -0.177782 | -0.073552 | -0.138254 | -0.152622 | -0.163071 |
| ind_nuevo | 0.002898 | 1.000000 | 0.026681 | 0.268051 | NaN | -0.000279 | 0.008165 | -0.000986 | -0.000296 | -0.000139 | ... | -0.002224 | -0.002701 | -0.001432 | -0.003034 | -0.004998 | -0.004265 | -0.001789 | -0.004081 | -0.004497 | -0.005767 |
| indrel | 0.011554 | 0.026681 | 1.000000 | 0.004462 | NaN | 0.001728 | -0.030518 | -0.000745 | -0.000447 | -0.000210 | ... | -0.002745 | -0.003067 | -0.001219 | -0.005014 | -0.005641 | -0.003062 | -0.002699 | -0.005771 | -0.005919 | -0.005887 |
| indrel_1mes | 0.001253 | 0.268051 | 0.004462 | 1.000000 | NaN | -0.000779 | -0.000859 | 0.000615 | -0.000088 | -0.000041 | ... | -0.000659 | -0.000800 | -0.000424 | -0.001801 | -0.001127 | -0.001333 | -0.000530 | -0.000162 | -0.000319 | -0.000852 |
| tipodom | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| cod_prov | -0.040761 | -0.000279 | 0.001728 | -0.000779 | NaN | 1.000000 | 0.025574 | -0.013720 | 0.001998 | 0.000567 | ... | -0.006910 | 0.001241 | -0.012453 | 0.003694 | 0.006096 | 0.012109 | 0.003906 | 0.007072 | 0.008223 | 0.012603 |
| ind_actividad_cliente | -0.187022 | 0.008165 | -0.030518 | -0.000859 | NaN | 0.025574 | 1.000000 | 0.036270 | 0.004421 | 0.005510 | ... | 0.086716 | 0.104347 | 0.031839 | 0.205940 | 0.225788 | 0.172032 | 0.035252 | 0.240178 | 0.253776 | 0.382317 |
| renta | -0.088417 | -0.000986 | -0.000745 | 0.000615 | NaN | -0.013720 | 0.036270 | 1.000000 | 0.002655 | 0.002259 | ... | 0.002591 | 0.022622 | -0.007544 | 0.040447 | 0.029966 | 0.033463 | 0.011001 | 0.015686 | 0.018120 | 0.023436 |
| ind_ahor_fin_ult1 | -0.013469 | -0.000296 | -0.000447 | -0.000088 | NaN | 0.001998 | 0.004421 | 0.002655 | 1.000000 | -0.000083 | ... | 0.001689 | 0.007171 | -0.000910 | 0.007578 | 0.000697 | 0.005423 | 0.013962 | -0.000491 | -0.000858 | 0.003143 |
| ind_aval_fin_ult1 | -0.005766 | -0.000139 | -0.000210 | -0.000041 | NaN | 0.000567 | 0.005510 | 0.002259 | -0.000083 | 1.000000 | ... | -0.000627 | 0.001915 | -0.000427 | 0.012449 | 0.011230 | 0.011909 | -0.000503 | 0.012582 | 0.011807 | 0.012694 |
| ind_cco_fin_ult1 | 0.249102 | -0.006167 | -0.001371 | -0.007860 | NaN | -0.020276 | -0.068090 | -0.023454 | -0.003762 | -0.002674 | ... | -0.090156 | -0.069646 | -0.060289 | -0.187968 | -0.190746 | -0.027543 | -0.025017 | -0.389643 | -0.415986 | -0.201739 |
| ind_cder_fin_ult1 | -0.016762 | -0.000540 | -0.000815 | -0.000160 | NaN | 0.000875 | 0.016487 | 0.002011 | -0.000324 | -0.000152 | ... | -0.000786 | 0.011132 | 0.001356 | 0.008263 | 0.006450 | 0.032099 | 0.004214 | 0.004907 | 0.005335 | 0.007485 |
| ind_cno_fin_ult1 | -0.152739 | -0.004806 | -0.006485 | -0.001246 | NaN | 0.009853 | 0.292279 | 0.019023 | -0.000156 | 0.011944 | ... | 0.172911 | 0.106174 | -0.004438 | 0.316168 | 0.408879 | 0.124390 | 0.030571 | 0.764734 | 0.808487 | 0.557550 |
| ind_ctju_fin_ult1 | -0.007041 | 0.001740 | 0.003927 | 0.005964 | NaN | 0.009212 | 0.053707 | 0.008065 | -0.001564 | -0.000734 | ... | -0.011801 | -0.014282 | -0.008042 | -0.032444 | -0.031261 | -0.023794 | -0.009463 | -0.032599 | -0.033555 | -0.052483 |
| ind_ctma_fin_ult1 | 0.048618 | 0.009878 | 0.009621 | 0.000141 | NaN | 0.005634 | 0.036328 | -0.001259 | -0.001330 | -0.000624 | ... | -0.008615 | -0.007592 | -0.006099 | -0.002925 | 0.006148 | -0.008239 | -0.007923 | 0.007084 | 0.006777 | 0.010471 |
| ind_ctop_fin_ult1 | -0.458529 | -0.009761 | -0.005442 | -0.003040 | NaN | 0.046661 | 0.119618 | 0.040046 | 0.007790 | 0.000671 | ... | 0.002040 | 0.031389 | -0.024707 | 0.018558 | 0.025753 | 0.071213 | 0.050850 | -0.002503 | -0.003805 | 0.000708 |
| ind_ctpp_fin_ult1 | -0.120625 | -0.004254 | -0.003735 | -0.000632 | NaN | 0.013516 | 0.146443 | 0.023205 | 0.004430 | 0.009405 | ... | 0.065755 | 0.054730 | -0.010159 | 0.154765 | 0.160296 | 0.078762 | 0.033961 | 0.182568 | 0.190409 | 0.213863 |
| ind_deco_fin_ult1 | 0.019293 | 0.030335 | 0.016590 | -0.000297 | NaN | 0.002115 | 0.018299 | 0.000405 | -0.000619 | -0.000290 | ... | -0.002502 | -0.001692 | -0.003182 | -0.002627 | -0.003262 | 0.000446 | -0.001320 | -0.000635 | -0.000771 | -0.006241 |
| ind_deme_fin_ult1 | -0.032339 | -0.001250 | 0.001345 | -0.000370 | NaN | 0.004147 | 0.048080 | 0.001311 | -0.000748 | 0.005364 | ... | 0.002075 | 0.010902 | -0.003323 | 0.008073 | 0.012918 | 0.014677 | 0.005511 | 0.015153 | 0.014909 | 0.012665 |
| ind_dela_fin_ult1 | -0.136544 | -0.003584 | -0.001610 | -0.001142 | NaN | 0.016153 | 0.224950 | 0.019788 | 0.001252 | 0.002174 | ... | 0.012901 | 0.058585 | -0.016675 | 0.054675 | 0.102741 | 0.076776 | 0.003210 | 0.086058 | 0.101077 | 0.097571 |
| ind_ecue_fin_ult1 | -0.146466 | -0.004543 | -0.000256 | -0.002014 | NaN | 0.016982 | 0.258731 | 0.024699 | 0.000534 | 0.012915 | ... | 0.066139 | 0.086720 | -0.015879 | 0.184176 | 0.254667 | 0.142069 | 0.005331 | 0.274026 | 0.297263 | 0.304257 |
| ind_fond_fin_ult1 | -0.148243 | -0.003716 | -0.002821 | -0.001101 | NaN | 0.004392 | 0.142834 | 0.025930 | 0.000550 | 0.002895 | ... | 0.014826 | 0.125524 | -0.003132 | 0.056533 | 0.089651 | 0.174048 | 0.015288 | 0.070073 | 0.079887 | 0.088418 |
| ind_hip_fin_ult1 | -0.100045 | -0.002224 | -0.002745 | -0.000659 | NaN | -0.006910 | 0.086716 | 0.002591 | 0.001689 | -0.000627 | ... | 1.000000 | 0.039625 | 0.008341 | 0.082643 | 0.119060 | 0.037856 | 0.003734 | 0.176266 | 0.172848 | 0.185752 |
| ind_plan_fin_ult1 | -0.125874 | -0.002701 | -0.003067 | -0.000800 | NaN | 0.001241 | 0.104347 | 0.022622 | 0.007171 | 0.001915 | ... | 0.039625 | 1.000000 | 0.001860 | 0.072279 | 0.104512 | 0.109479 | 0.020171 | 0.099471 | 0.099287 | 0.103149 |
| ind_pres_fin_ult1 | -0.027066 | -0.001432 | -0.001219 | -0.000424 | NaN | -0.012453 | 0.031839 | -0.007544 | -0.000910 | -0.000427 | ... | 0.008341 | 0.001860 | 1.000000 | 0.025674 | 0.004904 | 0.002526 | -0.002574 | -0.000279 | 0.000035 | -0.004496 |
| ind_reca_fin_ult1 | -0.189129 | -0.003034 | -0.005014 | -0.001801 | NaN | 0.003694 | 0.205940 | 0.040447 | 0.007578 | 0.012449 | ... | 0.082643 | 0.072279 | 0.025674 | 1.000000 | 0.250124 | 0.093774 | 0.049836 | 0.276807 | 0.295216 | 0.334014 |
| ind_tjcr_fin_ult1 | -0.190208 | -0.004998 | -0.005641 | -0.001127 | NaN | 0.006096 | 0.225788 | 0.029966 | 0.000697 | 0.011230 | ... | 0.119060 | 0.104512 | 0.004904 | 0.250124 | 1.000000 | 0.125498 | 0.030404 | 0.363530 | 0.378434 | 0.385731 |
| ind_valo_fin_ult1 | -0.177782 | -0.004265 | -0.003062 | -0.001333 | NaN | 0.012109 | 0.172032 | 0.033463 | 0.005423 | 0.011909 | ... | 0.037856 | 0.109479 | 0.002526 | 0.093774 | 0.125498 | 1.000000 | 0.029905 | 0.106440 | 0.115986 | 0.134370 |
| ind_viv_fin_ult1 | -0.073552 | -0.001789 | -0.002699 | -0.000530 | NaN | 0.003906 | 0.035252 | 0.011001 | 0.013962 | -0.000503 | ... | 0.003734 | 0.020171 | -0.002574 | 0.049836 | 0.030404 | 0.029905 | 1.000000 | 0.031183 | 0.028329 | 0.036890 |
| ind_nomina_ult1 | -0.138254 | -0.004081 | -0.005771 | -0.000162 | NaN | 0.007072 | 0.240178 | 0.015686 | -0.000491 | 0.012582 | ... | 0.176266 | 0.099471 | -0.000279 | 0.276807 | 0.363530 | 0.106440 | 0.031183 | 1.000000 | 0.944898 | 0.479629 |
| ind_nom_pens_ult1 | -0.152622 | -0.004497 | -0.005919 | -0.000319 | NaN | 0.008223 | 0.253776 | 0.018120 | -0.000858 | 0.011807 | ... | 0.172848 | 0.099287 | 0.000035 | 0.295216 | 0.378434 | 0.115986 | 0.028329 | 0.944898 | 1.000000 | 0.503970 |
| ind_recibo_ult1 | -0.163071 | -0.005767 | -0.005887 | -0.000852 | NaN | 0.012603 | 0.382317 | 0.023436 | 0.003143 | 0.012694 | ... | 0.185752 | 0.103149 | -0.004496 | 0.334014 | 0.385731 | 0.134370 | 0.036890 | 0.479629 | 0.503970 | 1.000000 |
32 rows × 32 columns
numbers = data.select_dtypes(['int64', 'float64']).columns.to_list()
fig, ax = plt.subplots(figsize=(30,30))
sns.heatmap(data[numbers].corr(), vmin=-1, vmax=1, cmap=sns.diverging_palette(20, 220, as_cmap=True), annot=True, linewidths=.5)
plt.show()
data.dtypes
fecha_dato object ncodpers int64 ind_empleado object pais_residencia object sexo object age object fecha_alta object ind_nuevo float64 antiguedad object indrel float64 ult_fec_cli_1t object indrel_1mes float64 tiprel_1mes object indresi object indext object conyuemp object canal_entrada object indfall object tipodom float64 cod_prov float64 nomprov object ind_actividad_cliente float64 renta float64 ind_ahor_fin_ult1 int64 ind_aval_fin_ult1 int64 ind_cco_fin_ult1 int64 ind_cder_fin_ult1 int64 ind_cno_fin_ult1 int64 ind_ctju_fin_ult1 int64 ind_ctma_fin_ult1 int64 ind_ctop_fin_ult1 int64 ind_ctpp_fin_ult1 int64 ind_deco_fin_ult1 int64 ind_deme_fin_ult1 int64 ind_dela_fin_ult1 int64 ind_ecue_fin_ult1 int64 ind_fond_fin_ult1 int64 ind_hip_fin_ult1 int64 ind_plan_fin_ult1 int64 ind_pres_fin_ult1 int64 ind_reca_fin_ult1 int64 ind_tjcr_fin_ult1 int64 ind_valo_fin_ult1 int64 ind_viv_fin_ult1 int64 ind_nomina_ult1 float64 ind_nom_pens_ult1 float64 ind_recibo_ult1 int64 dtype: object
data.isnull().sum()
fecha_dato 0 ncodpers 0 ind_empleado 10782 pais_residencia 10782 sexo 10786 age 0 fecha_alta 10782 ind_nuevo 10782 antiguedad 0 indrel 10782 ult_fec_cli_1t 998899 indrel_1mes 10782 tiprel_1mes 10782 indresi 10782 indext 10782 conyuemp 999822 canal_entrada 10861 indfall 10782 tipodom 10782 cod_prov 17734 nomprov 17734 ind_actividad_cliente 10782 renta 175183 ind_ahor_fin_ult1 0 ind_aval_fin_ult1 0 ind_cco_fin_ult1 0 ind_cder_fin_ult1 0 ind_cno_fin_ult1 0 ind_ctju_fin_ult1 0 ind_ctma_fin_ult1 0 ind_ctop_fin_ult1 0 ind_ctpp_fin_ult1 0 ind_deco_fin_ult1 0 ind_deme_fin_ult1 0 ind_dela_fin_ult1 0 ind_ecue_fin_ult1 0 ind_fond_fin_ult1 0 ind_hip_fin_ult1 0 ind_plan_fin_ult1 0 ind_pres_fin_ult1 0 ind_reca_fin_ult1 0 ind_tjcr_fin_ult1 0 ind_valo_fin_ult1 0 ind_viv_fin_ult1 0 ind_nomina_ult1 5402 ind_nom_pens_ult1 5402 ind_recibo_ult1 0 dtype: int64
data.skew()
ncodpers 0.032278 ind_nuevo 45.175718 indrel 29.924507 indrel_1mes 153.451384 tipodom 0.000000 cod_prov -0.151212 ind_actividad_cliente -0.262105 renta 52.234130 ind_ahor_fin_ult1 75.144758 ind_aval_fin_ult1 160.119026 ind_cco_fin_ult1 -1.152401 ind_cder_fin_ult1 41.098090 ind_cno_fin_ult1 2.571915 ind_ctju_fin_ult1 8.391620 ind_ctma_fin_ult1 9.903618 ind_ctop_fin_ult1 1.405709 ind_ctpp_fin_ult1 3.309287 ind_deco_fin_ult1 21.456830 ind_deme_fin_ult1 17.733145 ind_dela_fin_ult1 3.467512 ind_ecue_fin_ult1 2.555226 ind_fond_fin_ult1 5.815247 ind_hip_fin_ult1 9.858534 ind_plan_fin_ult1 8.107362 ind_pres_fin_ult1 14.544812 ind_reca_fin_ult1 3.294845 ind_tjcr_fin_ult1 3.493287 ind_valo_fin_ult1 4.736660 ind_viv_fin_ult1 12.338486 ind_nomina_ult1 3.322353 ind_nom_pens_ult1 3.107782 ind_recibo_ult1 1.792646 dtype: float64
# skew(data)
data.hist(figsize=(20,20), legend=True, grid=False)
plt.show()
numericData = data.select_dtypes(include= ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
fig, ax = plt.subplots(8,4, figsize=(20,17))
ax = ax.ravel()
position = 0
for i in numericData:
order = numericData[i].value_counts().index
sns.displot(data=numericData, x=i, kde=True, ax=ax[position])
ax[position].set_title(i)
position += 1
# plt.subplots_adjust(hspace=0.7)
plt.show()
categorical_Data = data.select_dtypes('object').columns.to_list()
print(categorical_Data)
position = 0
for i in categorical_Data:
fig, ax = plt.subplots(figsize=(15,15))
ax.tick_params(labelrotation=90)
ax.set_title(i, fontdict={'fontsize':17})
order = data[i].value_counts().index
sns.countplot(data=data, x=i, order=order)
position += 1
plt.subplots_adjust(hspace=0.7)
plt.show()
['fecha_dato', 'ind_empleado', 'pais_residencia', 'sexo', 'age', 'fecha_alta', 'antiguedad', 'ult_fec_cli_1t', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall', 'nomprov']
del numericData, categorical_Data
data['fecha_dato'] = pd.to_datetime(data['fecha_dato'])
data['fecha_alta'] = pd.to_datetime(data['fecha_alta'])
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 47 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fecha_dato 1000000 non-null datetime64[ns] 1 ncodpers 1000000 non-null int64 2 ind_empleado 989218 non-null object 3 pais_residencia 989218 non-null object 4 sexo 989214 non-null object 5 age 1000000 non-null object 6 fecha_alta 989218 non-null datetime64[ns] 7 ind_nuevo 989218 non-null float64 8 antiguedad 1000000 non-null object 9 indrel 989218 non-null float64 10 ult_fec_cli_1t 1101 non-null object 11 indrel_1mes 989218 non-null float64 12 tiprel_1mes 989218 non-null object 13 indresi 989218 non-null object 14 indext 989218 non-null object 15 conyuemp 178 non-null object 16 canal_entrada 989139 non-null object 17 indfall 989218 non-null object 18 tipodom 989218 non-null float64 19 cod_prov 982266 non-null float64 20 nomprov 982266 non-null object 21 ind_actividad_cliente 989218 non-null float64 22 renta 824817 non-null float64 23 ind_ahor_fin_ult1 1000000 non-null int64 24 ind_aval_fin_ult1 1000000 non-null int64 25 ind_cco_fin_ult1 1000000 non-null int64 26 ind_cder_fin_ult1 1000000 non-null int64 27 ind_cno_fin_ult1 1000000 non-null int64 28 ind_ctju_fin_ult1 1000000 non-null int64 29 ind_ctma_fin_ult1 1000000 non-null int64 30 ind_ctop_fin_ult1 1000000 non-null int64 31 ind_ctpp_fin_ult1 1000000 non-null int64 32 ind_deco_fin_ult1 1000000 non-null int64 33 ind_deme_fin_ult1 1000000 non-null int64 34 ind_dela_fin_ult1 1000000 non-null int64 35 ind_ecue_fin_ult1 1000000 non-null int64 36 ind_fond_fin_ult1 1000000 non-null int64 37 ind_hip_fin_ult1 1000000 non-null int64 38 ind_plan_fin_ult1 1000000 non-null int64 39 ind_pres_fin_ult1 1000000 non-null int64 40 ind_reca_fin_ult1 1000000 non-null int64 41 ind_tjcr_fin_ult1 1000000 non-null int64 42 ind_valo_fin_ult1 1000000 non-null int64 43 ind_viv_fin_ult1 1000000 non-null int64 44 ind_nomina_ult1 994598 non-null float64 45 ind_nom_pens_ult1 994598 non-null float64 46 ind_recibo_ult1 1000000 non-null int64 dtypes: datetime64[ns](2), float64(9), int64(23), object(13) memory usage: 358.6+ MB
eda.eda(data)
Preview of data:
| fecha_dato | ncodpers | ind_empleado | pais_residencia | sexo | age | fecha_alta | ind_nuevo | antiguedad | indrel | ... | ind_hip_fin_ult1 | ind_plan_fin_ult1 | ind_pres_fin_ult1 | ind_reca_fin_ult1 | ind_tjcr_fin_ult1 | ind_valo_fin_ult1 | ind_viv_fin_ult1 | ind_nomina_ult1 | ind_nom_pens_ult1 | ind_recibo_ult1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-28 | 1375586 | N | ES | H | 35 | 2015-01-12 | 0.0 | 6 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 1 | 2015-01-28 | 1050611 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 2 | 2015-01-28 | 1050612 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
3 rows × 47 columns
To check: (1) Total number of entries (2) Column types (3) Any null values <class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 47 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fecha_dato 1000000 non-null datetime64[ns] 1 ncodpers 1000000 non-null int64 2 ind_empleado 989218 non-null object 3 pais_residencia 989218 non-null object 4 sexo 989214 non-null object 5 age 1000000 non-null object 6 fecha_alta 989218 non-null datetime64[ns] 7 ind_nuevo 989218 non-null float64 8 antiguedad 1000000 non-null object 9 indrel 989218 non-null float64 10 ult_fec_cli_1t 1101 non-null object 11 indrel_1mes 989218 non-null float64 12 tiprel_1mes 989218 non-null object 13 indresi 989218 non-null object 14 indext 989218 non-null object 15 conyuemp 178 non-null object 16 canal_entrada 989139 non-null object 17 indfall 989218 non-null object 18 tipodom 989218 non-null float64 19 cod_prov 982266 non-null float64 20 nomprov 982266 non-null object 21 ind_actividad_cliente 989218 non-null float64 22 renta 824817 non-null float64 23 ind_ahor_fin_ult1 1000000 non-null int64 24 ind_aval_fin_ult1 1000000 non-null int64 25 ind_cco_fin_ult1 1000000 non-null int64 26 ind_cder_fin_ult1 1000000 non-null int64 27 ind_cno_fin_ult1 1000000 non-null int64 28 ind_ctju_fin_ult1 1000000 non-null int64 29 ind_ctma_fin_ult1 1000000 non-null int64 30 ind_ctop_fin_ult1 1000000 non-null int64 31 ind_ctpp_fin_ult1 1000000 non-null int64 32 ind_deco_fin_ult1 1000000 non-null int64 33 ind_deme_fin_ult1 1000000 non-null int64 34 ind_dela_fin_ult1 1000000 non-null int64 35 ind_ecue_fin_ult1 1000000 non-null int64 36 ind_fond_fin_ult1 1000000 non-null int64 37 ind_hip_fin_ult1 1000000 non-null int64 38 ind_plan_fin_ult1 1000000 non-null int64 39 ind_pres_fin_ult1 1000000 non-null int64 40 ind_reca_fin_ult1 1000000 non-null int64 41 ind_tjcr_fin_ult1 1000000 non-null int64 42 ind_valo_fin_ult1 1000000 non-null int64 43 ind_viv_fin_ult1 1000000 non-null int64 44 ind_nomina_ult1 994598 non-null float64 45 ind_nom_pens_ult1 994598 non-null float64 46 ind_recibo_ult1 1000000 non-null int64 dtypes: datetime64[ns](2), float64(9), int64(23), object(13) memory usage: 358.6+ MB None Preview of data with null values:
| fecha_dato | ncodpers | ind_empleado | pais_residencia | sexo | age | fecha_alta | ind_nuevo | antiguedad | indrel | ... | ind_hip_fin_ult1 | ind_plan_fin_ult1 | ind_pres_fin_ult1 | ind_reca_fin_ult1 | ind_tjcr_fin_ult1 | ind_valo_fin_ult1 | ind_viv_fin_ult1 | ind_nomina_ult1 | ind_nom_pens_ult1 | ind_recibo_ult1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-28 | 1375586 | N | ES | H | 35 | 2015-01-12 | 0.0 | 6 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 1 | 2015-01-28 | 1050611 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
| 2 | 2015-01-28 | 1050612 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
3 rows × 47 columns
No duplicated entries found
To check:
Unique count of non-numeric data
ind_empleado 5
pais_residencia 113
sexo 2
age 115
antiguedad 249
ult_fec_cli_1t 22
tiprel_1mes 3
indresi 2
indext 2
conyuemp 2
canal_entrada 156
indfall 2
nomprov 52
dtype: int64
Top 5 unique values of ind_empleado
ind_empleado Count
0 N 988260
1 B 387
2 A 287
3 F 282
4 S 2
Top 5 unique values of pais_residencia
pais_residencia Count
0 ES 982264
1 FR 546
2 AR 542
3 DE 487
4 GB 480
Top 5 unique values of sexo
sexo Count
0 V 562000
1 H 427214
Top 5 unique values of age
age Count
0 22 51017
1 23 45366
2 24 38992
3 21 34015
4 44 28800
Top 5 unique values of antiguedad
antiguedad Count
0 21 34320
1 23 23122
2 24 20467
3 12 19155
4 20 18582
Top 5 unique values of ult_fec_cli_1t
ult_fec_cli_1t Count
0 2015-07-01 97
1 2015-07-09 81
2 2015-07-06 76
3 2015-07-21 67
4 2015-07-07 63
Top 5 unique values of tiprel_1mes
tiprel_1mes Count
0 A 547800
1 I 441377
2 P 41
Top 5 unique values of indresi
indresi Count
0 S 982264
1 N 6954
Top 5 unique values of indext
indext Count
0 N 946328
1 S 42890
Top 5 unique values of conyuemp
conyuemp Count
0 N 176
1 S 2
Top 5 unique values of canal_entrada
canal_entrada Count
0 KAT 313944
1 KFC 259035
2 KHE 249254
3 KFA 37733
4 KAS 8190
Top 5 unique values of indfall
indfall Count
0 N 986107
1 S 3111
Top 5 unique values of nomprov
nomprov Count
0 MADRID 360131
1 BARCELONA 87372
2 VALENCIA 46394
3 SEVILLA 43922
4 CORUÑA, A 27545
To check:
Distribution of numeric data
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ncodpers | 1000000.0 | 690596.670395 | 404408.432011 | 15889.00 | 336411.00 | 664476.00 | 1074511.25 | 1379131.00 |
| ind_nuevo | 989218.0 | 0.000489 | 0.022114 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| indrel | 989218.0 | 1.109074 | 3.267624 | 1.00 | 1.00 | 1.00 | 1.00 | 99.00 |
| indrel_1mes | 989218.0 | 1.000085 | 0.012954 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00 |
| tipodom | 989218.0 | 1.000000 | 0.000000 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| cod_prov | 982266.0 | 26.852131 | 12.422924 | 1.00 | 18.00 | 28.00 | 33.00 | 52.00 |
| ind_actividad_cliente | 989218.0 | 0.564971 | 0.495761 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| renta | 824817.0 | 139646.150940 | 238985.824907 | 1202.73 | 71571.84 | 106651.86 | 163432.47 | 28894395.51 |
| ind_ahor_fin_ult1 | 1000000.0 | 0.000177 | 0.013303 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_aval_fin_ult1 | 1000000.0 | 0.000039 | 0.006245 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_cco_fin_ult1 | 1000000.0 | 0.749626 | 0.433229 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| ind_cder_fin_ult1 | 1000000.0 | 0.000591 | 0.024303 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_cno_fin_ult1 | 1000000.0 | 0.105296 | 0.306935 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_ctju_fin_ult1 | 1000000.0 | 0.013623 | 0.115920 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_ctma_fin_ult1 | 1000000.0 | 0.009894 | 0.098975 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_ctop_fin_ult1 | 1000000.0 | 0.212486 | 0.409067 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_ctpp_fin_ult1 | 1000000.0 | 0.072079 | 0.258619 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_deco_fin_ult1 | 1000000.0 | 0.002158 | 0.046404 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_deme_fin_ult1 | 1000000.0 | 0.003150 | 0.056036 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_dela_fin_ult1 | 1000000.0 | 0.066881 | 0.249816 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_ecue_fin_ult1 | 1000000.0 | 0.106267 | 0.308179 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_fond_fin_ult1 | 1000000.0 | 0.027182 | 0.162614 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_hip_fin_ult1 | 1000000.0 | 0.009982 | 0.099410 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_plan_fin_ult1 | 1000000.0 | 0.014553 | 0.119755 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_pres_fin_ult1 | 1000000.0 | 0.004661 | 0.068112 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_reca_fin_ult1 | 1000000.0 | 0.072581 | 0.259448 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_tjcr_fin_ult1 | 1000000.0 | 0.066084 | 0.248429 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_valo_fin_ult1 | 1000000.0 | 0.039378 | 0.194493 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_viv_fin_ult1 | 1000000.0 | 0.006442 | 0.080003 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_nomina_ult1 | 994598.0 | 0.071629 | 0.257873 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_nom_pens_ult1 | 994598.0 | 0.079543 | 0.270584 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ind_recibo_ult1 | 1000000.0 | 0.166275 | 0.372327 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
To check pairwise joint distribution of numeric data
To check time series of numeric data by daily, monthly and yearly frequency Plotting daily data
Plotting monthly data
Plotting yearly data
Plotting daily data
Plotting monthly data
Plotting yearly data